{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "# import seaborn as sns\n",
    "# import matplotlib.pyplot as plt\n",
    "import os\n",
    "from sklearn.model_selection import train_test_split\n",
    "# plt.style.use('seaborn-colorblind')\n",
    "# %matplotlib inline\n",
    "from feature_selection import filter_method as ft"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_breast_cancer\n",
    "data = load_breast_cancer()\n",
    "data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
    "                  columns= np.append(data['feature_names'], ['target']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean radius</th>\n",
       "      <th>mean texture</th>\n",
       "      <th>mean perimeter</th>\n",
       "      <th>mean area</th>\n",
       "      <th>mean smoothness</th>\n",
       "      <th>mean compactness</th>\n",
       "      <th>mean concavity</th>\n",
       "      <th>mean concave points</th>\n",
       "      <th>mean symmetry</th>\n",
       "      <th>mean fractal dimension</th>\n",
       "      <th>...</th>\n",
       "      <th>worst texture</th>\n",
       "      <th>worst perimeter</th>\n",
       "      <th>worst area</th>\n",
       "      <th>worst smoothness</th>\n",
       "      <th>worst compactness</th>\n",
       "      <th>worst concavity</th>\n",
       "      <th>worst concave points</th>\n",
       "      <th>worst symmetry</th>\n",
       "      <th>worst fractal dimension</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>17.99</td>\n",
       "      <td>10.38</td>\n",
       "      <td>122.80</td>\n",
       "      <td>1001.0</td>\n",
       "      <td>0.11840</td>\n",
       "      <td>0.27760</td>\n",
       "      <td>0.3001</td>\n",
       "      <td>0.14710</td>\n",
       "      <td>0.2419</td>\n",
       "      <td>0.07871</td>\n",
       "      <td>...</td>\n",
       "      <td>17.33</td>\n",
       "      <td>184.60</td>\n",
       "      <td>2019.0</td>\n",
       "      <td>0.1622</td>\n",
       "      <td>0.6656</td>\n",
       "      <td>0.7119</td>\n",
       "      <td>0.2654</td>\n",
       "      <td>0.4601</td>\n",
       "      <td>0.11890</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20.57</td>\n",
       "      <td>17.77</td>\n",
       "      <td>132.90</td>\n",
       "      <td>1326.0</td>\n",
       "      <td>0.08474</td>\n",
       "      <td>0.07864</td>\n",
       "      <td>0.0869</td>\n",
       "      <td>0.07017</td>\n",
       "      <td>0.1812</td>\n",
       "      <td>0.05667</td>\n",
       "      <td>...</td>\n",
       "      <td>23.41</td>\n",
       "      <td>158.80</td>\n",
       "      <td>1956.0</td>\n",
       "      <td>0.1238</td>\n",
       "      <td>0.1866</td>\n",
       "      <td>0.2416</td>\n",
       "      <td>0.1860</td>\n",
       "      <td>0.2750</td>\n",
       "      <td>0.08902</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>19.69</td>\n",
       "      <td>21.25</td>\n",
       "      <td>130.00</td>\n",
       "      <td>1203.0</td>\n",
       "      <td>0.10960</td>\n",
       "      <td>0.15990</td>\n",
       "      <td>0.1974</td>\n",
       "      <td>0.12790</td>\n",
       "      <td>0.2069</td>\n",
       "      <td>0.05999</td>\n",
       "      <td>...</td>\n",
       "      <td>25.53</td>\n",
       "      <td>152.50</td>\n",
       "      <td>1709.0</td>\n",
       "      <td>0.1444</td>\n",
       "      <td>0.4245</td>\n",
       "      <td>0.4504</td>\n",
       "      <td>0.2430</td>\n",
       "      <td>0.3613</td>\n",
       "      <td>0.08758</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>11.42</td>\n",
       "      <td>20.38</td>\n",
       "      <td>77.58</td>\n",
       "      <td>386.1</td>\n",
       "      <td>0.14250</td>\n",
       "      <td>0.28390</td>\n",
       "      <td>0.2414</td>\n",
       "      <td>0.10520</td>\n",
       "      <td>0.2597</td>\n",
       "      <td>0.09744</td>\n",
       "      <td>...</td>\n",
       "      <td>26.50</td>\n",
       "      <td>98.87</td>\n",
       "      <td>567.7</td>\n",
       "      <td>0.2098</td>\n",
       "      <td>0.8663</td>\n",
       "      <td>0.6869</td>\n",
       "      <td>0.2575</td>\n",
       "      <td>0.6638</td>\n",
       "      <td>0.17300</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>20.29</td>\n",
       "      <td>14.34</td>\n",
       "      <td>135.10</td>\n",
       "      <td>1297.0</td>\n",
       "      <td>0.10030</td>\n",
       "      <td>0.13280</td>\n",
       "      <td>0.1980</td>\n",
       "      <td>0.10430</td>\n",
       "      <td>0.1809</td>\n",
       "      <td>0.05883</td>\n",
       "      <td>...</td>\n",
       "      <td>16.67</td>\n",
       "      <td>152.20</td>\n",
       "      <td>1575.0</td>\n",
       "      <td>0.1374</td>\n",
       "      <td>0.2050</td>\n",
       "      <td>0.4000</td>\n",
       "      <td>0.1625</td>\n",
       "      <td>0.2364</td>\n",
       "      <td>0.07678</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 31 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \\\n",
       "0        17.99         10.38          122.80     1001.0          0.11840   \n",
       "1        20.57         17.77          132.90     1326.0          0.08474   \n",
       "2        19.69         21.25          130.00     1203.0          0.10960   \n",
       "3        11.42         20.38           77.58      386.1          0.14250   \n",
       "4        20.29         14.34          135.10     1297.0          0.10030   \n",
       "\n",
       "   mean compactness  mean concavity  mean concave points  mean symmetry  \\\n",
       "0           0.27760          0.3001              0.14710         0.2419   \n",
       "1           0.07864          0.0869              0.07017         0.1812   \n",
       "2           0.15990          0.1974              0.12790         0.2069   \n",
       "3           0.28390          0.2414              0.10520         0.2597   \n",
       "4           0.13280          0.1980              0.10430         0.1809   \n",
       "\n",
       "   mean fractal dimension   ...    worst texture  worst perimeter  worst area  \\\n",
       "0                 0.07871   ...            17.33           184.60      2019.0   \n",
       "1                 0.05667   ...            23.41           158.80      1956.0   \n",
       "2                 0.05999   ...            25.53           152.50      1709.0   \n",
       "3                 0.09744   ...            26.50            98.87       567.7   \n",
       "4                 0.05883   ...            16.67           152.20      1575.0   \n",
       "\n",
       "   worst smoothness  worst compactness  worst concavity  worst concave points  \\\n",
       "0            0.1622             0.6656           0.7119                0.2654   \n",
       "1            0.1238             0.1866           0.2416                0.1860   \n",
       "2            0.1444             0.4245           0.4504                0.2430   \n",
       "3            0.2098             0.8663           0.6869                0.2575   \n",
       "4            0.1374             0.2050           0.4000                0.1625   \n",
       "\n",
       "   worst symmetry  worst fractal dimension  target  \n",
       "0          0.4601                  0.11890     0.0  \n",
       "1          0.2750                  0.08902     0.0  \n",
       "2          0.3613                  0.08758     0.0  \n",
       "3          0.6638                  0.17300     0.0  \n",
       "4          0.2364                  0.07678     0.0  \n",
       "\n",
       "[5 rows x 31 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((455, 30), (114, 30))"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
    "                                                    data.target, test_size=0.2,\n",
    "                                                    random_state=0)\n",
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Variance method\n",
    "removing features that show the same value for the majority/all of the observations (constant/quasi-constant features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0  variables are found to be almost constant\n"
     ]
    }
   ],
   "source": [
    "# the original dataset has no constant variable\n",
    "quasi_constant_feature = ft.constant_feature_detect(data=X_train,threshold=0.9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0    0.923077\n",
       "0.0    0.068132\n",
       "2.0    0.008791\n",
       "Name: dummy, dtype: float64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# lets create a duumy variable that help us do the demonstration\n",
    "X_train['dummy'] = np.floor(X_train['worst smoothness']*10)\n",
    "# variable dummy has> 92% of the observations show one value, 1.0\n",
    "X_train.dummy.value_counts() / np.float(len(X_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1  variables are found to be almost constant\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "['dummy']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "quasi_constant_feature = ft.constant_feature_detect(data=X_train,threshold=0.9)\n",
    "quasi_constant_feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(455, 30)\n"
     ]
    }
   ],
   "source": [
    "# drop that variable\n",
    "X_train.drop(labels=quasi_constant_feature,axis=1,inplace=True)\n",
    "print(X_train.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Correlation method\n",
    "remove features that are highly correlated with each other"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "          feature1         feature2      corr\n",
      "0   mean perimeter      mean radius  0.998185\n",
      "6   mean perimeter        mean area  0.986692\n",
      "14  mean perimeter  worst perimeter  0.970507\n",
      "19  mean perimeter     worst radius  0.969520\n",
      "33  mean perimeter       worst area  0.941920 \n",
      "\n",
      "           feature1      feature2      corr\n",
      "12  perimeter error  radius error  0.978323\n",
      "30  perimeter error    area error  0.944995 \n",
      "\n",
      "          feature1             feature2      corr\n",
      "36  mean concavity  mean concave points  0.914627 \n",
      "\n",
      "        feature1       feature2      corr\n",
      "38  mean texture  worst texture  0.908182 \n",
      "\n",
      "                feature1             feature2      corr\n",
      "40  worst concave points  mean concave points  0.906312 \n",
      "\n"
     ]
    }
   ],
   "source": [
    "corr = ft.corr_feature_detect(data=X_train,threshold=0.9)\n",
    "# print all the correlated feature groups!\n",
    "for i in corr:\n",
    "    print(i,'\\n')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "then we can decide which ones to remove."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mutual Information Filter\n",
    "Mutual information measures how much information the presence/absence of a feature contributes to making the correct prediction on Y."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['mean concave points', 'worst perimeter', 'worst area'], dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# select the top 3 features\n",
    "mi = ft.mutual_info(X=X_train,y=y_train,select_k=3)\n",
    "print(mi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['mean perimeter', 'mean concave points', 'worst radius',\n",
      "       'worst perimeter', 'worst area', 'worst concave points'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# select the top 20% features\n",
    "mi = ft.mutual_info(X=X_train,y=y_train,select_k=0.2)\n",
    "print(mi)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chi-Square Filter\n",
    "Compute chi-squared stats between each non-negative feature and class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['mean area', 'area error', 'worst area'], dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# select the top 3 features\n",
    "chi = ft.chi_square_test(X=X_train,y=y_train,select_k=3)\n",
    "print(chi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['mean perimeter', 'mean area', 'area error', 'worst radius',\n",
      "       'worst perimeter', 'worst area'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "# select the top 20% features\n",
    "chi = ft.chi_square_test(X=X_train,y=y_train,select_k=0.2)\n",
    "print(chi)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Univariate ROC-AUC or MSE\n",
    "builds one decision tree per feature, to predict the target, then make predictions and ranks the features according to the machine learning metric (roc-auc or mse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "worst perimeter            0.917275\n",
      "worst area                 0.895840\n",
      "worst radius               0.893458\n",
      "worst concave points       0.863131\n",
      "mean concavity             0.856939\n",
      "mean radius                0.849000\n",
      "mean area                  0.839314\n",
      "worst concavity            0.831375\n",
      "mean perimeter             0.829628\n",
      "mean concave points        0.826453\n",
      "area error                 0.812321\n",
      "worst compactness          0.742299\n",
      "radius error               0.740235\n",
      "mean compactness           0.734360\n",
      "perimeter error            0.680534\n",
      "worst texture              0.647666\n",
      "worst fractal dimension    0.640997\n",
      "concavity error            0.640203\n",
      "worst symmetry             0.620991\n",
      "concave points error       0.618133\n",
      "compactness error          0.607336\n",
      "mean symmetry              0.591775\n",
      "mean texture               0.573357\n",
      "texture error              0.568593\n",
      "worst smoothness           0.565100\n",
      "mean smoothness            0.557637\n",
      "fractal dimension error    0.542077\n",
      "smoothness error           0.522706\n",
      "symmetry error             0.493649\n",
      "mean fractal dimension     0.475548\n",
      "dtype: float64\n",
      "11 out of the 30 featues are kept\n",
      "mean radius             0.849000\n",
      "mean perimeter          0.829628\n",
      "mean area               0.839314\n",
      "mean concavity          0.856939\n",
      "mean concave points     0.826453\n",
      "area error              0.812321\n",
      "worst radius            0.893458\n",
      "worst perimeter         0.917275\n",
      "worst area              0.895840\n",
      "worst concavity         0.831375\n",
      "worst concave points    0.863131\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "uni_roc_auc = ft.univariate_roc_auc(X_train=X_train,y_train=y_train,\n",
    "                                   X_test=X_test,y_test=y_test,threshold=0.8)\n",
    "print(uni_roc_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mean fractal dimension     0.491228\n",
      "symmetry error             0.480750\n",
      "fractal dimension error    0.456140\n",
      "smoothness error           0.449561\n",
      "texture error              0.412281\n",
      "worst smoothness           0.403265\n",
      "mean smoothness            0.399123\n",
      "mean texture               0.396930\n",
      "mean symmetry              0.363060\n",
      "compactness error          0.361842\n",
      "concave points error       0.357456\n",
      "worst fractal dimension    0.355263\n",
      "worst symmetry             0.350877\n",
      "worst texture              0.333333\n",
      "concavity error            0.333333\n",
      "perimeter error            0.300439\n",
      "mean compactness           0.258772\n",
      "worst compactness          0.254386\n",
      "radius error               0.245614\n",
      "area error                 0.179825\n",
      "mean perimeter             0.166667\n",
      "mean concave points        0.166667\n",
      "worst concavity            0.162281\n",
      "mean radius                0.146930\n",
      "mean concavity             0.142544\n",
      "mean area                  0.140351\n",
      "worst concave points       0.123782\n",
      "worst area                 0.103070\n",
      "worst radius               0.100877\n",
      "worst perimeter            0.098684\n",
      "dtype: float64\n",
      "6 out of the 30 featues are kept\n",
      "mean fractal dimension     0.491228\n",
      "texture error              0.412281\n",
      "smoothness error           0.449561\n",
      "symmetry error             0.480750\n",
      "fractal dimension error    0.456140\n",
      "worst smoothness           0.403265\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "uni_mse = ft.univariate_mse(X_train=X_train,y_train=y_train,\n",
    "                            X_test=X_test,y_test=y_test,threshold=0.4)\n",
    "print(uni_mse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
